{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from feature_engine.wrappers import SklearnTransformerWrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 81 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \\\n",
       "0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   \n",
       "2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   \n",
       "3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   \n",
       "\n",
       "  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0   2008        WD         Normal     208500  \n",
       "1   2007        WD         Normal     181500  \n",
       "2   2008        WD         Normal     223500  \n",
       "3   2006        WD        Abnorml     140000  \n",
       "4   2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 81 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('houseprice.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1022, 79), (438, 79))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's separate into training and testing set\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    data.drop(['Id', 'SalePrice'], axis=1), data['SalePrice'], test_size=0.3, random_state=0)\n",
    "\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['MSSubClass',\n",
       " 'LotFrontage',\n",
       " 'LotArea',\n",
       " 'OverallQual',\n",
       " 'OverallCond',\n",
       " 'YearBuilt',\n",
       " 'YearRemodAdd',\n",
       " 'MasVnrArea',\n",
       " 'BsmtFinSF1',\n",
       " 'BsmtFinSF2',\n",
       " 'BsmtUnfSF',\n",
       " 'TotalBsmtSF',\n",
       " '1stFlrSF',\n",
       " '2ndFlrSF',\n",
       " 'LowQualFinSF',\n",
       " 'GrLivArea',\n",
       " 'BsmtFullBath',\n",
       " 'BsmtHalfBath',\n",
       " 'FullBath',\n",
       " 'HalfBath',\n",
       " 'BedroomAbvGr',\n",
       " 'KitchenAbvGr',\n",
       " 'TotRmsAbvGrd',\n",
       " 'Fireplaces',\n",
       " 'GarageYrBlt',\n",
       " 'GarageCars',\n",
       " 'GarageArea',\n",
       " 'WoodDeckSF',\n",
       " 'OpenPorchSF',\n",
       " 'EnclosedPorch',\n",
       " '3SsnPorch',\n",
       " 'ScreenPorch',\n",
       " 'PoolArea',\n",
       " 'MiscVal',\n",
       " 'MoSold',\n",
       " 'YrSold']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols = [var for var in X_train.columns if X_train[var].dtypes !='O']\n",
    "\n",
    "cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SklearnTransformerWrapper(transformer=StandardScaler(),\n",
       "                          variables=['MSSubClass', 'LotFrontage', 'LotArea',\n",
       "                                     'OverallQual', 'OverallCond', 'YearBuilt',\n",
       "                                     'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',\n",
       "                                     'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',\n",
       "                                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',\n",
       "                                     'GrLivArea', 'BsmtFullBath',\n",
       "                                     'BsmtHalfBath', 'FullBath', 'HalfBath',\n",
       "                                     'BedroomAbvGr', 'KitchenAbvGr',\n",
       "                                     'TotRmsAbvGrd', 'Fireplaces',\n",
       "                                     'GarageYrBlt', 'GarageCars', 'GarageArea',\n",
       "                                     'WoodDeckSF', 'OpenPorchSF',\n",
       "                                     'EnclosedPorch', ...])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's apply the standard scaler on the above variables\n",
    "\n",
    "scaler = SklearnTransformerWrapper(transformer = StandardScaler(),\n",
    "                                    variables = cols)\n",
    "\n",
    "scaler.fit(X_train.fillna(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = scaler.transform(X_train.fillna(0))\n",
    "X_test = scaler.transform(X_test.fillna(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5.66144814e+01, 5.67847358e+01, 1.05679667e+04, 6.07925636e+00,\n",
       "       5.56262231e+00, 1.97094031e+03, 1.98469863e+03, 1.03046967e+02,\n",
       "       4.42224070e+02, 4.71272016e+01, 5.65992172e+02, 1.05534344e+03,\n",
       "       1.16172211e+03, 3.54725049e+02, 5.69080235e+00, 1.52213796e+03,\n",
       "       4.18786693e-01, 5.67514677e-02, 1.57632094e+00, 3.82583170e-01,\n",
       "       2.89432485e+00, 1.04500978e+00, 6.54892368e+00, 6.12524462e-01,\n",
       "       1.87349902e+03, 1.76418787e+00, 4.69398239e+02, 9.48522505e+01,\n",
       "       4.73786693e+01, 2.36076321e+01, 3.32583170e+00, 1.56467710e+01,\n",
       "       1.78669276e+00, 5.58649706e+01, 6.30039139e+00, 2.00783953e+03])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# mean values, learnt by the StandardScaler\n",
    "scaler.transformer_.mean_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4.21398764e+01, 3.41348001e+01, 1.02862405e+04, 1.35722328e+00,\n",
       "       1.11803056e+00, 3.02275350e+01, 2.04957916e+01, 1.81223904e+02,\n",
       "       4.29715456e+02, 1.57876396e+02, 4.32828598e+02, 4.10034673e+02,\n",
       "       3.73803122e+02, 4.40622334e+02, 4.77726012e+01, 5.17557055e+02,\n",
       "       5.14714307e-01, 2.35558243e-01, 5.41385993e-01, 4.97950772e-01,\n",
       "       8.02751557e-01, 2.21031096e-01, 1.61483273e+00, 6.35357186e-01,\n",
       "       4.43166351e+02, 7.33707095e-01, 2.08708471e+02, 1.28334288e+02,\n",
       "       6.75958872e+01, 6.42442872e+01, 2.87183246e+01, 5.62094688e+01,\n",
       "       3.33296043e+01, 5.87014705e+02, 2.70839455e+00, 1.34094155e+00])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# std values, learnt by the StandardScaler\n",
    "scaler.transformer_.scale_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MSSubClass       9.038215e-17\n",
       "LotFrontage      5.040543e-17\n",
       "LotArea         -4.953637e-17\n",
       "OverallQual     -3.024326e-16\n",
       "OverallCond     -3.476236e-17\n",
       "YearBuilt        1.360947e-15\n",
       "YearRemodAdd    -3.199007e-15\n",
       "MasVnrArea      -2.954801e-17\n",
       "BsmtFinSF1       5.996508e-17\n",
       "BsmtFinSF2      -1.738118e-18\n",
       "BsmtUnfSF       -6.952473e-18\n",
       "TotalBsmtSF     -2.780989e-17\n",
       "1stFlrSF         1.355732e-16\n",
       "2ndFlrSF        -5.561978e-17\n",
       "LowQualFinSF    -4.953637e-17\n",
       "GrLivArea       -7.126285e-17\n",
       "BsmtFullBath     3.823860e-17\n",
       "BsmtHalfBath     5.561978e-17\n",
       "FullBath        -1.199302e-16\n",
       "HalfBath         5.214355e-18\n",
       "BedroomAbvGr     1.112396e-16\n",
       "KitchenAbvGr    -3.319806e-16\n",
       "TotRmsAbvGrd    -2.468128e-16\n",
       "Fireplaces      -2.433366e-17\n",
       "GarageYrBlt     -2.085742e-17\n",
       "GarageCars      -9.038215e-17\n",
       "GarageArea      -1.738118e-17\n",
       "WoodDeckSF      -8.690591e-19\n",
       "OpenPorchSF     -2.259554e-17\n",
       "EnclosedPorch   -2.172648e-17\n",
       "3SsnPorch        1.738118e-18\n",
       "ScreenPorch      1.738118e-17\n",
       "PoolArea        -3.302425e-17\n",
       "MiscVal         -8.256062e-18\n",
       "MoSold          -1.251445e-16\n",
       "YrSold          -4.583070e-14\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the mean of the scaled variables is 0\n",
    "X_train[cols].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MSSubClass       1.00049\n",
       "LotFrontage      1.00049\n",
       "LotArea          1.00049\n",
       "OverallQual      1.00049\n",
       "OverallCond      1.00049\n",
       "YearBuilt        1.00049\n",
       "YearRemodAdd     1.00049\n",
       "MasVnrArea       1.00049\n",
       "BsmtFinSF1       1.00049\n",
       "BsmtFinSF2       1.00049\n",
       "BsmtUnfSF        1.00049\n",
       "TotalBsmtSF      1.00049\n",
       "1stFlrSF         1.00049\n",
       "2ndFlrSF         1.00049\n",
       "LowQualFinSF     1.00049\n",
       "GrLivArea        1.00049\n",
       "BsmtFullBath     1.00049\n",
       "BsmtHalfBath     1.00049\n",
       "FullBath         1.00049\n",
       "HalfBath         1.00049\n",
       "BedroomAbvGr     1.00049\n",
       "KitchenAbvGr     1.00049\n",
       "TotRmsAbvGrd     1.00049\n",
       "Fireplaces       1.00049\n",
       "GarageYrBlt      1.00049\n",
       "GarageCars       1.00049\n",
       "GarageArea       1.00049\n",
       "WoodDeckSF       1.00049\n",
       "OpenPorchSF      1.00049\n",
       "EnclosedPorch    1.00049\n",
       "3SsnPorch        1.00049\n",
       "ScreenPorch      1.00049\n",
       "PoolArea         1.00049\n",
       "MiscVal          1.00049\n",
       "MoSold           1.00049\n",
       "YrSold           1.00049\n",
       "dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the std of the scaled variables is ~1\n",
    "\n",
    "X_train[cols].std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fenotebook",
   "language": "python",
   "name": "fenotebook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
